In [1]:
# Notebook-wide IPython configuration.
# Load the autoreload extension and set mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
from munging import session
from munging import transform
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
In [23]:
## Load the training data.
# NOTE(review): the path is relative to the notebook's working directory.
data = pd.read_csv("data/amazon_employee_access/train.csv")
## Cast every column except the first to strings so that the ID-like values
## are recognized as categorical data instead of numbers.
## The builtin `str` is used rather than `np.str`: the latter was only an
## alias for the builtin and has been removed from recent NumPy releases.
for f in data.columns[1:]:
    data[f] = data[f].astype(str)
## Preview the first three rows.
data.head(3)
Out[23]:
In [25]:
## exploration session
# Build a munging Session over `data`. "ACTION" is passed as the second
# argument (presumably the target column -- confirm against munging.session);
# random_state=0 is a fixed value, presumably for reproducibility.
dsession = session.Session(data, "ACTION", random_state=0)
# Starts empty; presumably collects transformers in later cells -- TODO confirm.
transformers = []
# Parenthesized print works on both Python 2 and Python 3 and prints the
# same single value.
print(dsession.get_parameters())
In [26]:
## numerical and categorical features
# Ask the session for the features selected by each predicate.
# NOTE(review): the exact classification rules live in munging.session and
# are not visible here.
numerical_feats = dsession.get_features_of(dsession.is_numerical_feature)
categorical_feats = dsession.get_features_of(dsession.is_categorical_feature)
# Report how many features fall in each group. Parenthesized print is valid
# on both Python 2 and Python 3 and produces the same output.
print(len(numerical_feats))
print(len(categorical_feats))
In [28]:
## knowing what you are dealing with
# Relative frequency of each ACTION value: per-value counts divided by the
# total number of rows. Series.value_counts replaces the deprecated top-level
# pd.value_counts, and the explicit float() keeps true division on Python 2
# (the role the original `* 1.` played).
data["ACTION"].value_counts() / float(data.shape[0])
Out[28]:
In [ ]: